# Tree Based Models
source('/Users/ssobrinou/IE/Advanced/2019_Advanced/Regression/code/load_libraries.R')
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:data.table':
## 
##     hour, isoweek, mday, minute, month, quarter, second, wday,
##     week, yday, year
## The following object is masked from 'package:base':
## 
##     date
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
source('/Users/ssobrinou/IE/Advanced/2019_Advanced/Regression/code/f_partition.R')
source('/Users/ssobrinou/IE/Advanced/2019_Advanced/Regression/code/regression_metrics.R')


# Read the prepared automobile dataset and hand it to f_partition() (defined in
# the sourced f_partition.R) with a 20% test proportion and a fixed seed.
# The result is a list holding a `train` and a `test` data.table.
whole_data <- f_partition(
  df = fread("/Users/ssobrinou/IE/Advanced/2019_Advanced/Datasets/data_automobile_ready.csv"),
  test_proportion = 0.2,
  seed = 872367823
)

# Inspect the structure (column names, types, sizes) of both partitions.
str(whole_data)
## List of 2
##  $ train:Classes 'data.table' and 'data.frame':  156 obs. of  31 variables:
##   ..$ fuel_gas          : int [1:156] 1 1 1 1 1 1 1 1 1 1 ...
##   ..$ aspiration_turbo  : int [1:156] 0 0 0 0 0 0 1 0 0 0 ...
##   ..$ doors_others      : int [1:156] 0 0 0 0 0 0 0 0 0 0 ...
##   ..$ doors_two         : int [1:156] 0 1 0 1 1 1 1 1 0 0 ...
##   ..$ body_others       : int [1:156] 0 0 0 0 0 0 0 0 0 0 ...
##   ..$ body_sedan        : int [1:156] 1 0 0 0 0 0 0 1 1 1 ...
##   ..$ body_wagon        : int [1:156] 0 0 0 0 0 0 0 0 0 0 ...
##   ..$ drive_others      : int [1:156] 0 0 0 0 0 0 0 0 1 0 ...
##   ..$ drive_rwd         : int [1:156] 0 0 0 1 1 0 1 1 0 0 ...
##   ..$ engine_loc_others : int [1:156] 0 0 0 0 0 0 0 0 0 0 ...
##   ..$ wheel_base        : num [1:156] 93.1 86.6 97.2 98.4 94.5 94.5 95.9 94.5 97 96.5 ...
##   ..$ length            : num [1:156] 167 145 173 176 169 ...
##   ..$ width             : num [1:156] 64.2 63.9 65.2 65.6 64 64 66.3 64 65.4 64 ...
##   ..$ height            : num [1:156] 54.1 50.8 54.7 52 52.6 51.4 50.2 52.6 54.3 54.5 ...
##   ..$ weight            : int [1:156] 1950 1819 2324 2714 2204 2221 2818 2169 2385 2010 ...
##   ..$ engine_type_others: int [1:156] 0 0 0 0 0 0 0 0 1 0 ...
##   ..$ cyl_others        : int [1:156] 0 0 0 0 0 0 0 0 0 0 ...
##   ..$ cyl_six           : int [1:156] 0 0 0 0 0 0 0 0 0 0 ...
##   ..$ engine_size       : int [1:156] 91 92 120 146 98 109 156 98 108 92 ...
##   ..$ fuel_sys_idi      : int [1:156] 0 0 0 0 0 0 0 0 0 0 ...
##   ..$ fuel_sys_mpfi     : int [1:156] 0 0 0 1 0 1 0 0 0 0 ...
##   ..$ fuel_sys_others   : int [1:156] 0 1 0 0 0 0 1 0 0 1 ...
##   ..$ bore              : num [1:156] 3.08 2.91 3.33 3.62 3.19 3.19 3.59 3.19 3.62 2.91 ...
##   ..$ stroke            : num [1:156] 3.15 3.41 3.47 3.5 3.03 3.4 3.86 3.03 2.64 3.41 ...
##   ..$ compr_ratio       : num [1:156] 9 9.2 8.5 9.3 9 8.5 7 9 9 9.2 ...
##   ..$ hp                : int [1:156] 68 76 97 116 70 90 145 70 82 76 ...
##   ..$ peak_rpm          : int [1:156] 5000 6000 5200 4800 4800 5500 5000 4800 4800 6000 ...
##   ..$ city_mpg          : int [1:156] 31 31 27 24 29 24 19 29 24 30 ...
##   ..$ high_mpg          : int [1:156] 38 38 34 30 34 29 24 34 25 34 ...
##   ..$ price             : int [1:156] 7395 6855 8949 11549 8238 9980 12764 8058 9233 7295 ...
##   ..$ make_agg_toyota   : int [1:156] 0 0 0 1 1 0 0 1 0 0 ...
##   ..- attr(*, ".internal.selfref")=<externalptr> 
##  $ test :Classes 'data.table' and 'data.frame':  39 obs. of  31 variables:
##   ..$ fuel_gas          : int [1:39] 1 1 1 1 1 1 1 1 1 1 ...
##   ..$ aspiration_turbo  : int [1:39] 0 0 0 0 0 0 0 0 0 0 ...
##   ..$ doors_others      : int [1:39] 0 0 0 0 0 0 0 0 0 0 ...
##   ..$ doors_two         : int [1:39] 0 0 0 1 1 0 0 1 1 0 ...
##   ..$ body_others       : int [1:39] 0 0 0 0 0 0 0 0 0 0 ...
##   ..$ body_sedan        : int [1:39] 1 0 1 0 0 0 1 0 0 1 ...
##   ..$ body_wagon        : int [1:39] 0 1 0 0 0 0 0 0 0 0 ...
##   ..$ drive_others      : int [1:39] 1 0 0 0 0 0 0 0 0 0 ...
##   ..$ drive_rwd         : int [1:39] 0 0 1 0 0 0 0 0 0 0 ...
##   ..$ engine_loc_others : int [1:39] 0 0 0 0 0 0 0 0 0 0 ...
##   ..$ wheel_base        : num [1:39] 99.4 105.8 103.5 88.4 93.7 ...
##   ..$ length            : num [1:39] 177 193 189 141 157 ...
##   ..$ width             : num [1:39] 66.4 71.4 66.9 60.3 63.8 63.8 63.8 64 65.2 65.2 ...
##   ..$ height            : num [1:39] 54.3 55.7 55.7 53.2 50.8 50.6 50.6 52.6 53.3 54.1 ...
##   ..$ weight            : int [1:39] 2824 2954 3230 1488 1876 1967 1989 1940 2289 2304 ...
##   ..$ engine_type_others: int [1:39] 0 0 0 1 0 0 0 0 0 0 ...
##   ..$ cyl_others        : int [1:39] 1 1 0 1 0 0 0 0 0 0 ...
##   ..$ cyl_six           : int [1:39] 0 0 1 0 0 0 0 0 0 0 ...
##   ..$ engine_size       : int [1:39] 136 136 209 61 90 90 90 92 110 110 ...
##   ..$ fuel_sys_idi      : int [1:39] 0 0 0 0 0 0 0 0 0 0 ...
##   ..$ fuel_sys_mpfi     : int [1:39] 1 1 1 0 0 0 0 0 0 0 ...
##   ..$ fuel_sys_others   : int [1:39] 0 0 0 0 0 0 0 1 1 1 ...
##   ..$ bore              : num [1:39] 3.19 3.19 3.62 2.91 2.97 2.97 2.97 2.91 3.15 3.15 ...
##   ..$ stroke            : num [1:39] 3.4 3.4 3.39 3.03 3.23 3.23 3.23 3.41 3.58 3.58 ...
##   ..$ compr_ratio       : num [1:39] 8 8.5 8 9.5 9.4 9.4 9.4 9.2 9 9 ...
##   ..$ hp                : int [1:39] 115 110 182 48 68 68 68 76 86 86 ...
##   ..$ peak_rpm          : int [1:39] 5500 5500 5400 5100 5500 5500 5500 6000 5800 5800 ...
##   ..$ city_mpg          : int [1:39] 18 19 16 47 31 31 31 30 27 27 ...
##   ..$ high_mpg          : int [1:39] 22 25 22 53 38 38 38 34 33 33 ...
##   ..$ price             : int [1:39] 17450 18920 30760 5151 6377 6229 6692 6529 9095 8845 ...
##   ..$ make_agg_toyota   : int [1:39] 0 0 0 0 0 0 0 0 0 0 ...
##   ..- attr(*, ".internal.selfref")=<externalptr>
# Convert every integer column of each partition to double so that all
# predictors share one numeric type.
#
# The conversion is done by reference with data.table's `:=`, so each table is
# modified in place; the (same) table is returned so the list can be rebuilt.
# vapply() is used instead of sapply() because its return type is fixed: on a
# table with no columns sapply() would return an empty list and break the
# column selection. Tables without integer columns are passed through as-is.
whole_data <- lapply(whole_data, function(x) {
  int_cols <- names(x)[vapply(x, is.integer, logical(1))]
  if (length(int_cols) > 0) {
    x[, (int_cols) := lapply(.SD, as.numeric), .SDcols = int_cols]
  }
  x
})

# Check that the integer columns are now reported as `num`.
str(whole_data)
## List of 2
##  $ train:Classes 'data.table' and 'data.frame':  156 obs. of  31 variables:
##   ..$ fuel_gas          : num [1:156] 1 1 1 1 1 1 1 1 1 1 ...
##   ..$ aspiration_turbo  : num [1:156] 0 0 0 0 0 0 1 0 0 0 ...
##   ..$ doors_others      : num [1:156] 0 0 0 0 0 0 0 0 0 0 ...
##   ..$ doors_two         : num [1:156] 0 1 0 1 1 1 1 1 0 0 ...
##   ..$ body_others       : num [1:156] 0 0 0 0 0 0 0 0 0 0 ...
##   ..$ body_sedan        : num [1:156] 1 0 0 0 0 0 0 1 1 1 ...
##   ..$ body_wagon        : num [1:156] 0 0 0 0 0 0 0 0 0 0 ...
##   ..$ drive_others      : num [1:156] 0 0 0 0 0 0 0 0 1 0 ...
##   ..$ drive_rwd         : num [1:156] 0 0 0 1 1 0 1 1 0 0 ...
##   ..$ engine_loc_others : num [1:156] 0 0 0 0 0 0 0 0 0 0 ...
##   ..$ wheel_base        : num [1:156] 93.1 86.6 97.2 98.4 94.5 94.5 95.9 94.5 97 96.5 ...
##   ..$ length            : num [1:156] 167 145 173 176 169 ...
##   ..$ width             : num [1:156] 64.2 63.9 65.2 65.6 64 64 66.3 64 65.4 64 ...
##   ..$ height            : num [1:156] 54.1 50.8 54.7 52 52.6 51.4 50.2 52.6 54.3 54.5 ...
##   ..$ weight            : num [1:156] 1950 1819 2324 2714 2204 ...
##   ..$ engine_type_others: num [1:156] 0 0 0 0 0 0 0 0 1 0 ...
##   ..$ cyl_others        : num [1:156] 0 0 0 0 0 0 0 0 0 0 ...
##   ..$ cyl_six           : num [1:156] 0 0 0 0 0 0 0 0 0 0 ...
##   ..$ engine_size       : num [1:156] 91 92 120 146 98 109 156 98 108 92 ...
##   ..$ fuel_sys_idi      : num [1:156] 0 0 0 0 0 0 0 0 0 0 ...
##   ..$ fuel_sys_mpfi     : num [1:156] 0 0 0 1 0 1 0 0 0 0 ...
##   ..$ fuel_sys_others   : num [1:156] 0 1 0 0 0 0 1 0 0 1 ...
##   ..$ bore              : num [1:156] 3.08 2.91 3.33 3.62 3.19 3.19 3.59 3.19 3.62 2.91 ...
##   ..$ stroke            : num [1:156] 3.15 3.41 3.47 3.5 3.03 3.4 3.86 3.03 2.64 3.41 ...
##   ..$ compr_ratio       : num [1:156] 9 9.2 8.5 9.3 9 8.5 7 9 9 9.2 ...
##   ..$ hp                : num [1:156] 68 76 97 116 70 90 145 70 82 76 ...
##   ..$ peak_rpm          : num [1:156] 5000 6000 5200 4800 4800 5500 5000 4800 4800 6000 ...
##   ..$ city_mpg          : num [1:156] 31 31 27 24 29 24 19 29 24 30 ...
##   ..$ high_mpg          : num [1:156] 38 38 34 30 34 29 24 34 25 34 ...
##   ..$ price             : num [1:156] 7395 6855 8949 11549 8238 ...
##   ..$ make_agg_toyota   : num [1:156] 0 0 0 1 1 0 0 1 0 0 ...
##   ..- attr(*, ".internal.selfref")=<externalptr> 
##  $ test :Classes 'data.table' and 'data.frame':  39 obs. of  31 variables:
##   ..$ fuel_gas          : num [1:39] 1 1 1 1 1 1 1 1 1 1 ...
##   ..$ aspiration_turbo  : num [1:39] 0 0 0 0 0 0 0 0 0 0 ...
##   ..$ doors_others      : num [1:39] 0 0 0 0 0 0 0 0 0 0 ...
##   ..$ doors_two         : num [1:39] 0 0 0 1 1 0 0 1 1 0 ...
##   ..$ body_others       : num [1:39] 0 0 0 0 0 0 0 0 0 0 ...
##   ..$ body_sedan        : num [1:39] 1 0 1 0 0 0 1 0 0 1 ...
##   ..$ body_wagon        : num [1:39] 0 1 0 0 0 0 0 0 0 0 ...
##   ..$ drive_others      : num [1:39] 1 0 0 0 0 0 0 0 0 0 ...
##   ..$ drive_rwd         : num [1:39] 0 0 1 0 0 0 0 0 0 0 ...
##   ..$ engine_loc_others : num [1:39] 0 0 0 0 0 0 0 0 0 0 ...
##   ..$ wheel_base        : num [1:39] 99.4 105.8 103.5 88.4 93.7 ...
##   ..$ length            : num [1:39] 177 193 189 141 157 ...
##   ..$ width             : num [1:39] 66.4 71.4 66.9 60.3 63.8 63.8 63.8 64 65.2 65.2 ...
##   ..$ height            : num [1:39] 54.3 55.7 55.7 53.2 50.8 50.6 50.6 52.6 53.3 54.1 ...
##   ..$ weight            : num [1:39] 2824 2954 3230 1488 1876 ...
##   ..$ engine_type_others: num [1:39] 0 0 0 1 0 0 0 0 0 0 ...
##   ..$ cyl_others        : num [1:39] 1 1 0 1 0 0 0 0 0 0 ...
##   ..$ cyl_six           : num [1:39] 0 0 1 0 0 0 0 0 0 0 ...
##   ..$ engine_size       : num [1:39] 136 136 209 61 90 90 90 92 110 110 ...
##   ..$ fuel_sys_idi      : num [1:39] 0 0 0 0 0 0 0 0 0 0 ...
##   ..$ fuel_sys_mpfi     : num [1:39] 1 1 1 0 0 0 0 0 0 0 ...
##   ..$ fuel_sys_others   : num [1:39] 0 0 0 0 0 0 0 1 1 1 ...
##   ..$ bore              : num [1:39] 3.19 3.19 3.62 2.91 2.97 2.97 2.97 2.91 3.15 3.15 ...
##   ..$ stroke            : num [1:39] 3.4 3.4 3.39 3.03 3.23 3.23 3.23 3.41 3.58 3.58 ...
##   ..$ compr_ratio       : num [1:39] 8 8.5 8 9.5 9.4 9.4 9.4 9.2 9 9 ...
##   ..$ hp                : num [1:39] 115 110 182 48 68 68 68 76 86 86 ...
##   ..$ peak_rpm          : num [1:39] 5500 5500 5400 5100 5500 5500 5500 6000 5800 5800 ...
##   ..$ city_mpg          : num [1:39] 18 19 16 47 31 31 31 30 27 27 ...
##   ..$ high_mpg          : num [1:39] 22 25 22 53 38 38 38 34 33 33 ...
##   ..$ price             : num [1:39] 17450 18920 30760 5151 6377 ...
##   ..$ make_agg_toyota   : num [1:39] 0 0 0 0 0 0 0 0 0 0 ...
##   ..- attr(*, ".internal.selfref")=<externalptr>
# Model formula shared by the trees below: price regressed on all the other
# columns of the data passed to the fitting function.
formula <- price ~ .


#### 1.1 Base R Partitioning Tree 
library(rpart)
library(rpart.plot)
library(partykit)
## Loading required package: grid
## Loading required package: libcoin
## Loading required package: mvtnorm
# Grow a regression ("anova") tree on the training partition.
# cp = 0 removes the complexity-based stopping rule, so growth is limited only
# by the remaining control settings (see tree_0$control); model = TRUE keeps a
# copy of the model frame inside the fitted object.
tree_0 <- rpart(
  formula = formula,
  data = whole_data$train,
  method = "anova",
  model = TRUE,
  cp = 0
)

# Text listing of the fitted tree: one line per node with its split, size,
# deviance and predicted value.
print(tree_0)
## n= 156 
## 
## node), split, n, deviance, yval
##       * denotes terminal node
## 
##  1) root 156 9507260000 13064.040  
##    2) engine_size< 182 146 3229339000 11427.030  
##      4) weight< 2544 89  442823600  8395.393  
##        8) length< 172.7 63   86829530  7401.762  
##         16) weight< 2124 36   23223920  6694.861  
##           32) weight< 1947 15    5064828  6093.800 *
##           33) weight>=1947 21    8869173  7124.190  
##             66) hp< 68.5 8    3449536  6596.000 *
##             67) hp>=68.5 13    1814290  7449.231 *
##         17) weight>=2124 27   21630080  8344.296  
##           34) hp< 89 19    9388133  8031.526 *
##           35) hp>=89 8    5968915  9087.125 *
##        9) length>=172.7 26  143078300 10803.040  
##         18) peak_rpm< 5350 19   17366900  9743.421 *
##         19) peak_rpm>=5350 7   46474610 13679.140 *
##      5) weight>=2544 57  691334500 16160.630  
##       10) width< 68.6 50  491555400 15542.020  
##         20) hp< 118 27  142082300 13943.780  
##           40) weight< 2923.5 15   68462070 13016.530 *
##           41) weight>=2923.5 12   44602590 15102.830 *
##         21) hp>=118 23  199542200 17418.220  
##           42) stroke>=3.31 10   21436830 14821.100 *
##           43) stroke< 3.31 13   58770460 19416.000 *
##       11) width>=68.6 7   43973520 20579.290 *
##    3) engine_size>=182 10  174348500 36964.500 *
# Same tree shown through partykit's representation (indented node listing
# plus counts of inner and terminal nodes).
print(partykit::as.party(tree_0))
## 
## Model formula:
## price ~ fuel_gas + aspiration_turbo + doors_others + doors_two + 
##     body_others + body_sedan + body_wagon + drive_others + drive_rwd + 
##     engine_loc_others + wheel_base + length + width + height + 
##     weight + engine_type_others + cyl_others + cyl_six + engine_size + 
##     fuel_sys_idi + fuel_sys_mpfi + fuel_sys_others + bore + stroke + 
##     compr_ratio + hp + peak_rpm + city_mpg + high_mpg + make_agg_toyota
## 
## Fitted party:
## [1] root
## |   [2] engine_size < 182
## |   |   [3] weight < 2544
## |   |   |   [4] length < 172.7
## |   |   |   |   [5] weight < 2124
## |   |   |   |   |   [6] weight < 1947: 6093.800 (n = 15, err = 5064828.4)
## |   |   |   |   |   [7] weight >= 1947
## |   |   |   |   |   |   [8] hp < 68.5: 6596.000 (n = 8, err = 3449536.0)
## |   |   |   |   |   |   [9] hp >= 68.5: 7449.231 (n = 13, err = 1814290.3)
## |   |   |   |   [10] weight >= 2124
## |   |   |   |   |   [11] hp < 89: 8031.526 (n = 19, err = 9388132.7)
## |   |   |   |   |   [12] hp >= 89: 9087.125 (n = 8, err = 5968914.9)
## |   |   |   [13] length >= 172.7
## |   |   |   |   [14] peak_rpm < 5350: 9743.421 (n = 19, err = 17366898.6)
## |   |   |   |   [15] peak_rpm >= 5350: 13679.143 (n = 7, err = 46474610.9)
## |   |   [16] weight >= 2544
## |   |   |   [17] width < 68.6
## |   |   |   |   [18] hp < 118
## |   |   |   |   |   [19] weight < 2923.5: 13016.533 (n = 15, err = 68462073.7)
## |   |   |   |   |   [20] weight >= 2923.5: 15102.833 (n = 12, err = 44602589.7)
## |   |   |   |   [21] hp >= 118
## |   |   |   |   |   [22] stroke >= 3.31: 14821.100 (n = 10, err = 21436830.9)
## |   |   |   |   |   [23] stroke < 3.31: 19416.000 (n = 13, err = 58770458.0)
## |   |   |   [24] width >= 68.6: 20579.286 (n = 7, err = 43973521.4)
## |   [25] engine_size >= 182: 36964.500 (n = 10, err = 174348546.5)
## 
## Number of inner nodes:    12
## Number of terminal nodes: 13
# Names of the components stored in the fitted rpart object.
ls(tree_0)
##  [1] "call"                "control"             "cptable"            
##  [4] "frame"               "functions"           "method"             
##  [7] "model"               "numresp"             "ordered"            
## [10] "parms"               "splits"              "terms"              
## [13] "variable.importance" "where"
# Node table of the tree: one row per node with its split variable (or
# <leaf>), size, deviance, fitted value and complexity.
tree_0[["frame"]]
##            var   n  wt        dev      yval   complexity ncompete
## 1  engine_size 156 156 9507260033 13064.045 0.6419906639        4
## 2       weight 146 146 3229339306 11427.027 0.2203769724        4
## 4       length  89  89  442823613  8395.393 0.0223950693        4
## 8       weight  63  63   86829529  7401.762 0.0044151031        4
## 16      weight  36  36   23223916  6694.861 0.0009771390        4
## 32      <leaf>  15  15    5064828  6093.800 0.0000000000        0
## 33          hp  21  21    8869173  7124.190 0.0003792204        4
## 66      <leaf>   8   8    3449536  6596.000 0.0000000000        0
## 67      <leaf>  13  13    1814290  7449.231 0.0000000000        0
## 17          hp  27  27   21630080  8344.296 0.0006598149        4
## 34      <leaf>  19  19    9388133  8031.526 0.0000000000        0
## 35      <leaf>   8   8    5968915  9087.125 0.0000000000        0
## 9     peak_rpm  26  26  143078337 10803.038 0.0083343495        4
## 18      <leaf>  19  19   17366899  9743.421 0.0000000000        0
## 19      <leaf>   7   7   46474611 13679.143 0.0000000000        0
## 5        width  57  57  691334511 16160.632 0.0163880597        4
## 10          hp  50  50  491555445 15542.020 0.0157701476        4
## 20      weight  27  27  142082315 13943.778 0.0030521571        4
## 40      <leaf>  15  15   68462074 13016.533 0.0000000000        0
## 41      <leaf>  12  12   44602590 15102.833 0.0000000000        0
## 21      stroke  23  23  199542236 17418.217 0.0125519810        4
## 42      <leaf>  10  10   21436831 14821.100 0.0000000000        0
## 43      <leaf>  13  13   58770458 19416.000 0.0000000000        0
## 11      <leaf>   7   7   43973521 20579.286 0.0000000000        0
## 3       <leaf>  10  10  174348546 36964.500 0.0000000000        0
##    nsurrogate
## 1           5
## 2           5
## 4           5
## 8           5
## 16          5
## 32          0
## 33          5
## 66          0
## 67          0
## 17          5
## 34          0
## 35          0
## 9           5
## 18          0
## 19          0
## 5           2
## 10          5
## 20          5
## 40          0
## 41          0
## 21          5
## 42          0
## 43          0
## 11          0
## 3           0
# Control parameters the tree was grown with (minsplit, minbucket, cp, ...).
tree_0[["control"]]
## $minsplit
## [1] 20
## 
## $minbucket
## [1] 7
## 
## $cp
## [1] 0
## 
## $maxcompete
## [1] 4
## 
## $maxsurrogate
## [1] 5
## 
## $usesurrogate
## [1] 2
## 
## $surrogatestyle
## [1] 0
## 
## $maxdepth
## [1] 30
## 
## $xval
## [1] 10
# Variable importance scores stored in the fit, as a named numeric vector.
tree_0[["variable.importance"]]
##      engine_size           weight               hp         city_mpg 
##       7956375152       6595368016       5939116695       5820053825 
##            width           length         high_mpg        drive_rwd 
##       3402342998       3313757603       1770753309       1323272325 
##             bore       wheel_base           height           stroke 
##        226203752        199613450        162772363        121137620 
##         peak_rpm          cyl_six    fuel_sys_mpfi  fuel_sys_others 
##         83157472         71706080         59667474         59667474 
##       cyl_others      compr_ratio        doors_two aspiration_turbo 
##         22257935          7017283          3096638          2352387
# Complexity table: for each candidate CP value, the number of splits, the
# relative training error and the cross-validated error with its std. error.
tree_0[["cptable"]]
##              CP nsplit  rel error    xerror       xstd
## 1  0.6419906639      0 1.00000000 1.0061701 0.19634783
## 2  0.2203769724      1 0.35800934 0.4096693 0.05526871
## 3  0.0223950693      2 0.13763236 0.1966299 0.03396699
## 4  0.0163880597      3 0.11523729 0.1784056 0.03210153
## 5  0.0157701476      4 0.09884923 0.1801067 0.02947649
## 6  0.0125519810      5 0.08307909 0.1762945 0.02932287
## 7  0.0083343495      6 0.07052711 0.1539249 0.02509774
## 8  0.0044151031      7 0.06219276 0.1418581 0.02354036
## 9  0.0030521571      8 0.05777765 0.1393956 0.02332931
## 10 0.0009771390      9 0.05472550 0.1373463 0.02303610
## 11 0.0006598149     10 0.05374836 0.1361763 0.02306136
## 12 0.0003792204     11 0.05308854 0.1354099 0.02308686
## 13 0.0000000000     12 0.05270932 0.1347064 0.02310826
# Plot the complexity table. Because a matrix is passed, the first two columns
# are used as coordinates (CP on the x axis, nsplit on the y axis).
plot(tree_0$cptable, type = "b")
grid()

# Pruning at the largest CP in the table collapses the tree to its root node.
prune(tree_0, cp = max(tree_0$cptable[, "CP"]))
## n= 156 
## 
## node), split, n, deviance, yval
##       * denotes terminal node
## 
## 1) root 156 9507260000 13064.04 *
# Pruning at the smallest CP in the table (0 here) keeps the full tree.
prune(tree_0, cp = min(tree_0$cptable[, "CP"]))
## n= 156 
## 
## node), split, n, deviance, yval
##       * denotes terminal node
## 
##  1) root 156 9507260000 13064.040  
##    2) engine_size< 182 146 3229339000 11427.030  
##      4) weight< 2544 89  442823600  8395.393  
##        8) length< 172.7 63   86829530  7401.762  
##         16) weight< 2124 36   23223920  6694.861  
##           32) weight< 1947 15    5064828  6093.800 *
##           33) weight>=1947 21    8869173  7124.190  
##             66) hp< 68.5 8    3449536  6596.000 *
##             67) hp>=68.5 13    1814290  7449.231 *
##         17) weight>=2124 27   21630080  8344.296  
##           34) hp< 89 19    9388133  8031.526 *
##           35) hp>=89 8    5968915  9087.125 *
##        9) length>=172.7 26  143078300 10803.040  
##         18) peak_rpm< 5350 19   17366900  9743.421 *
##         19) peak_rpm>=5350 7   46474610 13679.140 *
##      5) weight>=2544 57  691334500 16160.630  
##       10) width< 68.6 50  491555400 15542.020  
##         20) hp< 118 27  142082300 13943.780  
##           40) weight< 2923.5 15   68462070 13016.530 *
##           41) weight>=2923.5 12   44602590 15102.830 *
##         21) hp>=118 23  199542200 17418.220  
##           42) stroke>=3.31 10   21436830 14821.100 *
##           43) stroke< 3.31 13   58770460 19416.000 *
##       11) width>=68.6 7   43973520 20579.290 *
##    3) engine_size>=182 10  174348500 36964.500 *
# Plotting the tree
# Basic plot with base graphics; text() adds the split and leaf labels.
# TRUE/FALSE are spelled out because T and F are ordinary variables that can
# be reassigned, whereas TRUE and FALSE are reserved words.
plot(tree_0, uniform = TRUE, branch = 0.5, compress = TRUE)
text(tree_0, cex = 0.75)

# rpart.plot from the rpart.plot library
rpart.plot(tree_0, fallen.leaves = FALSE)

# Same plot with the leaves aligned at the bottom and a green box palette.
rpart.plot(tree_0, fallen.leaves = TRUE, box.palette = "Gn")

# Interactive tree plot
library(visNetwork)

# Default interactive rendering of the fitted tree.
visTree(tree_0)

# Customised rendering. Logical options use TRUE/FALSE rather than the
# reassignable T/F shortcuts.
visTree(tree_0,
        fallenLeaves = TRUE,
        edgesFontSize = 14,
        nodesFontSize = 16,
        legend = TRUE,
        colorVar = RColorBrewer::brewer.pal(12, 'Paired'),
        colorEdges = 'darkgray',
        colorY = c('palegreen', 'tomato'),
        main = 'Regression Tree',
        tooltipDelay = 0.001,
        digits = 0,
        minNodeSize = 10,
        highlightNearest = list(enabled = TRUE, hover = TRUE, algorithm = "hierarchical"),
        collapse = list(enabled = TRUE, fit = TRUE, resetHighlight = TRUE,
                        clusterOptions = list(fixed = FALSE, physics = FALSE)),
        nodesPopSize = TRUE,
        edgesFontAlign = "horizontal")
# Let's generate an NA in engine_size (the variable used for the root split)
# to see how rpart handles a missing value.

# Work on a deep copy so the original training partition is left untouched.
whole_data$train_2 <- copy(whole_data$train)
# Blank out engine_size in row 27 by reference. set() is data.table's in-place
# assignment, and NA_real_ matches the column's numeric type.
set(whole_data$train_2, i = 27L, j = "engine_size", value = NA_real_)

# Sanity check: the original has no missing values, the copy has exactly one.
sum(is.na(whole_data$train))
## [1] 0
sum(is.na(whole_data$train_2))
## [1] 1
# Refit the same tree specification on the data containing the missing value.
tree_1 <- rpart(formula = formula, data = whole_data$train_2, method = 'anova', model = TRUE, cp = 0)

# summary() prints the detailed per-node report (primary and surrogate splits);
# the trailing [[1]] then extracts the first component of the object it returns.
summary(tree_1)[[1]]
## Call:
## rpart(formula = formula, data = whole_data$train_2, method = "anova", 
##     model = TRUE, cp = 0)
##   n= 156 
## 
##              CP nsplit  rel error    xerror       xstd
## 1  0.6419906639      0 1.00000000 1.0100025 0.19821010
## 2  0.2203769724      1 0.35800934 0.4066895 0.05483298
## 3  0.0223950693      2 0.13763236 0.1762818 0.03182151
## 4  0.0163880597      3 0.11523729 0.1639441 0.03109905
## 5  0.0157701476      4 0.09884923 0.1503590 0.02296524
## 6  0.0125519810      5 0.08307909 0.1543844 0.02286437
## 7  0.0083343495      6 0.07052711 0.1507823 0.02255946
## 8  0.0044151031      7 0.06219276 0.1403067 0.02153874
## 9  0.0030521571      8 0.05777765 0.1392559 0.02165782
## 10 0.0009771390      9 0.05472550 0.1311765 0.02047493
## 11 0.0006598149     10 0.05374836 0.1296058 0.02041857
## 12 0.0003792204     11 0.05308854 0.1299522 0.02040772
## 13 0.0000000000     12 0.05270932 0.1294686 0.02042537
## 
## Variable importance
## engine_size      weight          hp    city_mpg       width      length 
##          21          18          16          16           9           9 
##    high_mpg   drive_rwd        bore  wheel_base 
##           5           4           1           1 
## 
## Node number 1: 156 observations,    complexity param=0.6419907
##   mean=13064.04, MSE=6.094397e+07 
##   left son=2 (146 obs) right son=3 (10 obs)
##   Primary splits:
##       engine_size < 182    to the left,  improve=0.6403823, (1 missing)
##       city_mpg    < 17.5   to the right, improve=0.5479412, (0 missing)
##       hp          < 175.5  to the left,  improve=0.5262288, (0 missing)
##       weight      < 2697.5 to the left,  improve=0.5036809, (0 missing)
##       high_mpg    < 28.5   to the right, improve=0.4750277, (0 missing)
##   Surrogate splits:
##       weight   < 3495   to the left,  agree=0.981, adj=0.7, (1 split)
##       hp       < 175.5  to the left,  agree=0.981, adj=0.7, (0 split)
##       city_mpg < 16.5   to the right, agree=0.981, adj=0.7, (0 split)
##       length   < 199.05 to the left,  agree=0.968, adj=0.5, (0 split)
##       width    < 69.25  to the left,  agree=0.968, adj=0.5, (0 split)
## 
## Node number 2: 146 observations,    complexity param=0.220377
##   mean=11427.03, MSE=2.211876e+07 
##   left son=4 (89 obs) right son=5 (57 obs)
##   Primary splits:
##       weight      < 2544   to the left,  improve=0.6487956, (0 missing)
##       high_mpg    < 28.5   to the right, improve=0.5943413, (0 missing)
##       engine_size < 126    to the left,  improve=0.5649050, (1 missing)
##       hp          < 94.5   to the left,  improve=0.5402361, (0 missing)
##       city_mpg    < 23.5   to the right, improve=0.4989328, (0 missing)
##   Surrogate splits:
##       high_mpg    < 28.5   to the right, agree=0.911, adj=0.772, (0 split)
##       engine_size < 126    to the left,  agree=0.890, adj=0.719, (0 split)
##       hp          < 104    to the left,  agree=0.877, adj=0.684, (0 split)
##       city_mpg    < 22     to the right, agree=0.863, adj=0.649, (0 split)
##       drive_rwd   < 0.5    to the left,  agree=0.856, adj=0.632, (0 split)
## 
## Node number 3: 10 observations
##   mean=36964.5, MSE=1.743485e+07 
## 
## Node number 4: 89 observations,    complexity param=0.02239507
##   mean=8395.393, MSE=4975546 
##   left son=8 (63 obs) right son=9 (26 obs)
##   Primary splits:
##       length     < 172.7  to the left,  improve=0.4808139, (0 missing)
##       weight     < 2287.5 to the left,  improve=0.4666525, (0 missing)
##       wheel_base < 98.6   to the left,  improve=0.4051903, (0 missing)
##       width      < 64.5   to the left,  improve=0.4023945, (0 missing)
##       hp         < 83     to the left,  improve=0.3833154, (0 missing)
##   Surrogate splits:
##       wheel_base  < 97.85  to the left,  agree=0.921, adj=0.731, (0 split)
##       weight      < 2301   to the left,  agree=0.910, adj=0.692, (0 split)
##       width       < 65.55  to the left,  agree=0.876, adj=0.577, (0 split)
##       engine_size < 115.5  to the left,  agree=0.876, adj=0.577, (0 split)
##       bore        < 3.29   to the left,  agree=0.831, adj=0.423, (0 split)
## 
## Node number 5: 57 observations,    complexity param=0.01638806
##   mean=16160.63, MSE=1.212868e+07 
##   left son=10 (50 obs) right son=11 (7 obs)
##   Primary splits:
##       width      < 68.6   to the left,  improve=0.2253693, (0 missing)
##       hp         < 118    to the left,  improve=0.2057451, (0 missing)
##       cyl_six    < 0.5    to the left,  improve=0.1899323, (0 missing)
##       wheel_base < 100.8  to the left,  improve=0.1879356, (0 missing)
##       weight     < 2697.5 to the left,  improve=0.1791694, (0 missing)
##   Surrogate splits:
##       wheel_base < 108.55 to the left,  agree=0.895, adj=0.143, (0 split)
##       cyl_others < 0.5    to the left,  agree=0.895, adj=0.143, (0 split)
## 
## Node number 8: 63 observations,    complexity param=0.004415103
##   mean=7401.762, MSE=1378246 
##   left son=16 (36 obs) right son=17 (27 obs)
##   Primary splits:
##       weight      < 2124   to the left,  improve=0.4834246, (0 missing)
##       engine_size < 94.5   to the left,  improve=0.4310051, (1 missing)
##       length      < 165.45 to the left,  improve=0.3917216, (0 missing)
##       city_mpg    < 29.5   to the right, improve=0.3752696, (0 missing)
##       hp          < 80     to the left,  improve=0.3258077, (0 missing)
##   Surrogate splits:
##       engine_size < 97.5   to the left,  agree=0.889, adj=0.741, (0 split)
##       city_mpg    < 29.5   to the right, agree=0.889, adj=0.741, (0 split)
##       hp          < 77     to the left,  agree=0.857, adj=0.667, (0 split)
##       high_mpg    < 36.5   to the right, agree=0.841, adj=0.630, (0 split)
##       length      < 165.65 to the left,  agree=0.825, adj=0.593, (0 split)
## 
## Node number 9: 26 observations,    complexity param=0.008334349
##   mean=10803.04, MSE=5503013 
##   left son=18 (19 obs) right son=19 (7 obs)
##   Primary splits:
##       peak_rpm      < 5350   to the left,  improve=0.5538003, (0 missing)
##       hp            < 99     to the left,  improve=0.3699824, (0 missing)
##       engine_size   < 115    to the right, improve=0.2830159, (0 missing)
##       fuel_sys_mpfi < 0.5    to the left,  improve=0.2509445, (0 missing)
##       length        < 176.4  to the left,  improve=0.2469144, (0 missing)
##   Surrogate splits:
##       bore        < 3.23   to the right, agree=0.923, adj=0.714, (0 split)
##       engine_size < 115    to the right, agree=0.885, adj=0.571, (0 split)
##       hp          < 99     to the left,  agree=0.885, adj=0.571, (0 split)
##       city_mpg    < 23.5   to the right, agree=0.885, adj=0.571, (0 split)
##       high_mpg    < 29.5   to the right, agree=0.885, adj=0.571, (0 split)
## 
## Node number 10: 50 observations,    complexity param=0.01577015
##   mean=15542.02, MSE=9831109 
##   left son=20 (27 obs) right son=21 (23 obs)
##   Primary splits:
##       hp          < 118    to the left,  improve=0.3050132, (0 missing)
##       engine_size < 162.5  to the left,  improve=0.2531597, (0 missing)
##       cyl_six     < 0.5    to the left,  improve=0.2348309, (0 missing)
##       weight      < 2697.5 to the left,  improve=0.1646924, (0 missing)
##       peak_rpm    < 4375   to the right, improve=0.1423281, (0 missing)
##   Surrogate splits:
##       city_mpg    < 20.5   to the right, agree=0.86, adj=0.696, (0 split)
##       engine_size < 154    to the left,  agree=0.84, adj=0.652, (0 split)
##       cyl_six     < 0.5    to the left,  agree=0.76, adj=0.478, (0 split)
##       high_mpg    < 26.5   to the right, agree=0.76, adj=0.478, (0 split)
##       height      < 54.85  to the right, agree=0.74, adj=0.435, (0 split)
## 
## Node number 11: 7 observations
##   mean=20579.29, MSE=6281932 
## 
## Node number 16: 36 observations,    complexity param=0.000977139
##   mean=6694.861, MSE=645108.8 
##   left son=32 (15 obs) right son=33 (21 obs)
##   Primary splits:
##       weight      < 1947   to the left,  improve=0.4000150, (0 missing)
##       hp          < 68.5   to the left,  improve=0.3353818, (0 missing)
##       length      < 160.75 to the left,  improve=0.3168454, (0 missing)
##       bore        < 3.065  to the left,  improve=0.3050193, (0 missing)
##       engine_size < 94.5   to the left,  improve=0.2602266, (0 missing)
##   Surrogate splits:
##       city_mpg    < 36     to the right, agree=0.750, adj=0.400, (0 split)
##       high_mpg    < 40     to the right, agree=0.750, adj=0.400, (0 split)
##       doors_two   < 0.5    to the right, agree=0.722, adj=0.333, (0 split)
##       bore        < 3.04   to the left,  agree=0.722, adj=0.333, (0 split)
##       compr_ratio < 9.405  to the right, agree=0.722, adj=0.333, (0 split)
## 
## Node number 17: 27 observations,    complexity param=0.0006598149
##   mean=8344.296, MSE=801114.1 
##   left son=34 (19 obs) right son=35 (8 obs)
##   Primary splits:
##       hp            < 89     to the left,  improve=0.2900143, (0 missing)
##       fuel_sys_mpfi < 0.5    to the left,  improve=0.1561566, (0 missing)
##       peak_rpm      < 5100   to the left,  improve=0.1321882, (0 missing)
##       weight        < 2287.5 to the left,  improve=0.1298636, (0 missing)
##       bore          < 3.23   to the right, improve=0.1232483, (0 missing)
##   Surrogate splits:
##       compr_ratio      < 8.05   to the right, agree=0.889, adj=0.625, (0 split)
##       peak_rpm         < 5375   to the left,  agree=0.889, adj=0.625, (0 split)
##       city_mpg         < 24.5   to the right, agree=0.852, adj=0.500, (0 split)
##       high_mpg         < 30.5   to the right, agree=0.852, adj=0.500, (0 split)
##       aspiration_turbo < 0.5    to the left,  agree=0.815, adj=0.375, (0 split)
## 
## Node number 18: 19 observations
##   mean=9743.421, MSE=914047.3 
## 
## Node number 19: 7 observations
##   mean=13679.14, MSE=6639230 
## 
## Node number 20: 27 observations,    complexity param=0.003052157
##   mean=13943.78, MSE=5262308 
##   left son=40 (15 obs) right son=41 (12 obs)
##   Primary splits:
##       weight      < 2923.5 to the left,  improve=0.20423130, (0 missing)
##       wheel_base  < 102.35 to the left,  improve=0.19255060, (0 missing)
##       compr_ratio < 9.405  to the left,  improve=0.17862940, (0 missing)
##       hp          < 96     to the right, improve=0.09875490, (0 missing)
##       length      < 186.65 to the left,  improve=0.08896211, (0 missing)
##   Surrogate splits:
##       length     < 186.65 to the left,  agree=0.926, adj=0.833, (0 split)
##       width      < 67.05  to the left,  agree=0.926, adj=0.833, (0 split)
##       height     < 56.15  to the left,  agree=0.926, adj=0.833, (0 split)
##       wheel_base < 102.35 to the left,  agree=0.889, adj=0.750, (0 split)
##       bore       < 3.66   to the left,  agree=0.778, adj=0.500, (0 split)
## 
## Node number 21: 23 observations,    complexity param=0.01255198
##   mean=17418.22, MSE=8675749 
##   left son=42 (10 obs) right son=43 (13 obs)
##   Primary splits:
##       stroke      < 3.31   to the right, improve=0.5980435, (0 missing)
##       high_mpg    < 24.5   to the left,  improve=0.3815231, (0 missing)
##       height      < 54.2   to the left,  improve=0.3073280, (0 missing)
##       compr_ratio < 7.65   to the left,  improve=0.2493117, (0 missing)
##       body_sedan  < 0.5    to the left,  improve=0.2394175, (0 missing)
##   Surrogate splits:
##       height          < 54.2   to the left,  agree=0.826, adj=0.6, (0 split)
##       fuel_sys_mpfi   < 0.5    to the left,  agree=0.783, adj=0.5, (0 split)
##       fuel_sys_others < 0.5    to the right, agree=0.783, adj=0.5, (0 split)
##       bore            < 3.29   to the left,  agree=0.783, adj=0.5, (0 split)
##       width           < 66.7   to the left,  agree=0.739, adj=0.4, (0 split)
## 
## Node number 32: 15 observations
##   mean=6093.8, MSE=337655.2 
## 
## Node number 33: 21 observations,    complexity param=0.0003792204
##   mean=7124.19, MSE=422341.6 
##   left son=66 (8 obs) right son=67 (13 obs)
##   Primary splits:
##       hp          < 68.5   to the left,  improve=0.4065032, (0 missing)
##       bore        < 3.065  to the left,  improve=0.3706976, (0 missing)
##       length      < 160.55 to the left,  improve=0.3437479, (0 missing)
##       engine_size < 94.5   to the left,  improve=0.3238951, (0 missing)
##       stroke      < 3.26   to the left,  improve=0.3157922, (0 missing)
##   Surrogate splits:
##       high_mpg    < 37.5   to the right, agree=0.952, adj=0.875, (0 split)
##       engine_size < 94.5   to the left,  agree=0.857, adj=0.625, (0 split)
##       bore        < 3.115  to the left,  agree=0.857, adj=0.625, (0 split)
##       height      < 51.7   to the left,  agree=0.810, adj=0.500, (0 split)
##       stroke      < 3.26   to the left,  agree=0.810, adj=0.500, (0 split)
## 
## Node number 34: 19 observations
##   mean=8031.526, MSE=494112.2 
## 
## Node number 35: 8 observations
##   mean=9087.125, MSE=746114.4 
## 
## Node number 40: 15 observations
##   mean=13016.53, MSE=4564138 
## 
## Node number 41: 12 observations
##   mean=15102.83, MSE=3716882 
## 
## Node number 42: 10 observations
##   mean=14821.1, MSE=2143683 
## 
## Node number 43: 13 observations
##   mean=19416, MSE=4520804 
## 
## Node number 66: 8 observations
##   mean=6596, MSE=431192 
## 
## Node number 67: 13 observations
##   mean=7449.231, MSE=139560.8
##            var   n  wt        dev      yval   complexity ncompete
## 1  engine_size 156 156 9507260033 13064.045 0.6419906639        4
## 2       weight 146 146 3229339306 11427.027 0.2203769724        4
## 4       length  89  89  442823613  8395.393 0.0223950693        4
## 8       weight  63  63   86829529  7401.762 0.0044151031        4
## 16      weight  36  36   23223916  6694.861 0.0009771390        4
## 32      <leaf>  15  15    5064828  6093.800 0.0000000000        0
## 33          hp  21  21    8869173  7124.190 0.0003792204        4
## 66      <leaf>   8   8    3449536  6596.000 0.0000000000        0
## 67      <leaf>  13  13    1814290  7449.231 0.0000000000        0
## 17          hp  27  27   21630080  8344.296 0.0006598149        4
## 34      <leaf>  19  19    9388133  8031.526 0.0000000000        0
## 35      <leaf>   8   8    5968915  9087.125 0.0000000000        0
## 9     peak_rpm  26  26  143078337 10803.038 0.0083343495        4
## 18      <leaf>  19  19   17366899  9743.421 0.0000000000        0
## 19      <leaf>   7   7   46474611 13679.143 0.0000000000        0
## 5        width  57  57  691334511 16160.632 0.0163880597        4
## 10          hp  50  50  491555445 15542.020 0.0157701476        4
## 20      weight  27  27  142082315 13943.778 0.0030521571        4
## 40      <leaf>  15  15   68462074 13016.533 0.0000000000        0
## 41      <leaf>  12  12   44602590 15102.833 0.0000000000        0
## 21      stroke  23  23  199542236 17418.217 0.0125519810        4
## 42      <leaf>  10  10   21436831 14821.100 0.0000000000        0
## 43      <leaf>  13  13   58770458 19416.000 0.0000000000        0
## 11      <leaf>   7   7   43973521 20579.286 0.0000000000        0
## 3       <leaf>  10  10  174348546 36964.500 0.0000000000        0
##    nsurrogate
## 1           5
## 2           5
## 4           5
## 8           5
## 16          5
## 32          0
## 33          5
## 66          0
## 67          0
## 17          5
## 34          0
## 35          0
## 9           5
## 18          0
## 19          0
## 5           2
## 10          5
## 20          5
## 40          0
## 41          0
## 21          5
## 42          0
## 43          0
## 11          0
## 3           0
# Convert the fitted tree_1 to a party object and print it: the model formula,
# the nested split layout, and for every leaf its prediction with n and err.
print(as.party(tree_1))
## 
## Model formula:
## price ~ fuel_gas + aspiration_turbo + doors_others + doors_two + 
##     body_others + body_sedan + body_wagon + drive_others + drive_rwd + 
##     engine_loc_others + wheel_base + length + width + height + 
##     weight + engine_type_others + cyl_others + cyl_six + engine_size + 
##     fuel_sys_idi + fuel_sys_mpfi + fuel_sys_others + bore + stroke + 
##     compr_ratio + hp + peak_rpm + city_mpg + high_mpg + make_agg_toyota
## 
## Fitted party:
## [1] root
## |   [2] engine_size < 182
## |   |   [3] weight < 2544
## |   |   |   [4] length < 172.7
## |   |   |   |   [5] weight < 2124
## |   |   |   |   |   [6] weight < 1947: 6093.800 (n = 15, err = 5064828.4)
## |   |   |   |   |   [7] weight >= 1947
## |   |   |   |   |   |   [8] hp < 68.5: 6596.000 (n = 8, err = 3449536.0)
## |   |   |   |   |   |   [9] hp >= 68.5: 7449.231 (n = 13, err = 1814290.3)
## |   |   |   |   [10] weight >= 2124
## |   |   |   |   |   [11] hp < 89: 8031.526 (n = 19, err = 9388132.7)
## |   |   |   |   |   [12] hp >= 89: 9087.125 (n = 8, err = 5968914.9)
## |   |   |   [13] length >= 172.7
## |   |   |   |   [14] peak_rpm < 5350: 9743.421 (n = 19, err = 17366898.6)
## |   |   |   |   [15] peak_rpm >= 5350: 13679.143 (n = 7, err = 46474610.9)
## |   |   [16] weight >= 2544
## |   |   |   [17] width < 68.6
## |   |   |   |   [18] hp < 118
## |   |   |   |   |   [19] weight < 2923.5: 13016.533 (n = 15, err = 68462073.7)
## |   |   |   |   |   [20] weight >= 2923.5: 15102.833 (n = 12, err = 44602589.7)
## |   |   |   |   [21] hp >= 118
## |   |   |   |   |   [22] stroke >= 3.31: 14821.100 (n = 10, err = 21436830.9)
## |   |   |   |   |   [23] stroke < 3.31: 19416.000 (n = 13, err = 58770458.0)
## |   |   |   [24] width >= 68.6: 20579.286 (n = 7, err = 43973521.4)
## |   [25] engine_size >= 182: 36964.500 (n = 10, err = 174348546.5)
## 
## Number of inner nodes:    12
## Number of terminal nodes: 13
# Same party-style printout for tree_0. The recorded output lists exactly the
# same splits, leaf values and node counts as the tree_1 printout before it.
print(as.party(tree_0))
## 
## Model formula:
## price ~ fuel_gas + aspiration_turbo + doors_others + doors_two + 
##     body_others + body_sedan + body_wagon + drive_others + drive_rwd + 
##     engine_loc_others + wheel_base + length + width + height + 
##     weight + engine_type_others + cyl_others + cyl_six + engine_size + 
##     fuel_sys_idi + fuel_sys_mpfi + fuel_sys_others + bore + stroke + 
##     compr_ratio + hp + peak_rpm + city_mpg + high_mpg + make_agg_toyota
## 
## Fitted party:
## [1] root
## |   [2] engine_size < 182
## |   |   [3] weight < 2544
## |   |   |   [4] length < 172.7
## |   |   |   |   [5] weight < 2124
## |   |   |   |   |   [6] weight < 1947: 6093.800 (n = 15, err = 5064828.4)
## |   |   |   |   |   [7] weight >= 1947
## |   |   |   |   |   |   [8] hp < 68.5: 6596.000 (n = 8, err = 3449536.0)
## |   |   |   |   |   |   [9] hp >= 68.5: 7449.231 (n = 13, err = 1814290.3)
## |   |   |   |   [10] weight >= 2124
## |   |   |   |   |   [11] hp < 89: 8031.526 (n = 19, err = 9388132.7)
## |   |   |   |   |   [12] hp >= 89: 9087.125 (n = 8, err = 5968914.9)
## |   |   |   [13] length >= 172.7
## |   |   |   |   [14] peak_rpm < 5350: 9743.421 (n = 19, err = 17366898.6)
## |   |   |   |   [15] peak_rpm >= 5350: 13679.143 (n = 7, err = 46474610.9)
## |   |   [16] weight >= 2544
## |   |   |   [17] width < 68.6
## |   |   |   |   [18] hp < 118
## |   |   |   |   |   [19] weight < 2923.5: 13016.533 (n = 15, err = 68462073.7)
## |   |   |   |   |   [20] weight >= 2923.5: 15102.833 (n = 12, err = 44602589.7)
## |   |   |   |   [21] hp >= 118
## |   |   |   |   |   [22] stroke >= 3.31: 14821.100 (n = 10, err = 21436830.9)
## |   |   |   |   |   [23] stroke < 3.31: 19416.000 (n = 13, err = 58770458.0)
## |   |   |   [24] width >= 68.6: 20579.286 (n = 7, err = 43973521.4)
## |   [25] engine_size >= 182: 36964.500 (n = 10, err = 174348546.5)
## 
## Number of inner nodes:    12
## Number of terminal nodes: 13
# Show row 27 of the train_2 table (its engine_size is NA in the recorded output)
whole_data[["train_2"]][27]
##    fuel_gas aspiration_turbo doors_others doors_two body_others body_sedan
## 1:        1                0            0         1           0          0
##    body_wagon drive_others drive_rwd engine_loc_others wheel_base length
## 1:          0            1         0                 0       93.3  157.3
##    width height weight engine_type_others cyl_others cyl_six engine_size
## 1:  63.8   55.7   2240                  1          0       0          NA
##    fuel_sys_idi fuel_sys_mpfi fuel_sys_others bore stroke compr_ratio hp
## 1:            0             0               0 3.62   2.64         8.7 73
##    peak_rpm city_mpg high_mpg price make_agg_toyota
## 1:     4400       26       31  7603               0
# Score that single row with tree_1. engine_size is NA there, yet a value is
# still returned -- presumably through the tree's surrogate splits; confirm.
predict(tree_1, newdata = whole_data$train_2[27])
##        1 
## 8031.526
# Alternative partitioning algorithm: a conditional inference tree,
# fitted on the same formula and training table as the trees above
library(partykit)
ctree_0 <- ctree(formula = formula, data = whole_data$train)
## Warning in min(diff(sort(ux))): ningún argumento finito para min;
## retornando Inf

## Warning in min(diff(sort(ux))): ningún argumento finito para min;
## retornando Inf

## Warning in min(diff(sort(ux))): ningún argumento finito para min;
## retornando Inf

## Warning in min(diff(sort(ux))): ningún argumento finito para min;
## retornando Inf

## Warning in min(diff(sort(ux))): ningún argumento finito para min;
## retornando Inf

## Warning in min(diff(sort(ux))): ningún argumento finito para min;
## retornando Inf

## Warning in min(diff(sort(ux))): ningún argumento finito para min;
## retornando Inf

## Warning in min(diff(sort(ux))): ningún argumento finito para min;
## retornando Inf

## Warning in min(diff(sort(ux))): ningún argumento finito para min;
## retornando Inf

## Warning in min(diff(sort(ux))): ningún argumento finito para min;
## retornando Inf

## Warning in min(diff(sort(ux))): ningún argumento finito para min;
## retornando Inf

## Warning in min(diff(sort(ux))): ningún argumento finito para min;
## retornando Inf

## Warning in min(diff(sort(ux))): ningún argumento finito para min;
## retornando Inf

## Warning in min(diff(sort(ux))): ningún argumento finito para min;
## retornando Inf

## Warning in min(diff(sort(ux))): ningún argumento finito para min;
## retornando Inf

## Warning in min(diff(sort(ux))): ningún argumento finito para min;
## retornando Inf

## Warning in min(diff(sort(ux))): ningún argumento finito para min;
## retornando Inf
# Print the conditional inference tree; the recorded output ends with its
# inner/terminal node counts (6 and 7, versus 12 and 13 for tree_0).
print(ctree_0)
## 
## Model formula:
## price ~ fuel_gas + aspiration_turbo + doors_others + doors_two + 
##     body_others + body_sedan + body_wagon + drive_others + drive_rwd + 
##     engine_loc_others + wheel_base + length + width + height + 
##     weight + engine_type_others + cyl_others + cyl_six + engine_size + 
##     fuel_sys_idi + fuel_sys_mpfi + fuel_sys_others + bore + stroke + 
##     compr_ratio + hp + peak_rpm + city_mpg + high_mpg + make_agg_toyota
## 
## Fitted party:
## [1] root
## |   [2] engine_size <= 181
## |   |   [3] weight <= 2540
## |   |   |   [4] weight <= 2275
## |   |   |   |   [5] weight <= 2004: 6386.500 (n = 24, err = 12759550.0)
## |   |   |   |   [6] weight > 2004: 7794.367 (n = 30, err = 15484561.0)
## |   |   |   [7] weight > 2275: 10288.086 (n = 35, err = 181506896.7)
## |   |   [8] weight > 2540
## |   |   |   [9] cyl_six <= 0
## |   |   |   |   [10] width <= 66.9: 13823.727 (n = 22, err = 135640364.4)
## |   |   |   |   [11] width > 66.9: 16847.545 (n = 22, err = 219503325.5)
## |   |   |   [12] cyl_six > 0: 18952.923 (n = 13, err = 104305834.9)
## |   [13] engine_size > 181: 36964.500 (n = 10, err = 174348546.5)
## 
## Number of inner nodes:    6
## Number of terminal nodes: 7
# Draw the fitted conditional inference tree on the active graphics device
plot(ctree_0)

# Tree predictions: fitted values for the training rows (no newdata given)
# and numeric predictions for the held-out test rows
train_tree <- predict(tree_0)
test_tree <- predict(tree_0, type = "vector", newdata = whole_data$test)

# Pair every training row's actual price with the tree's fitted value.
# `id` is a running row index; it is the key used when the table is melted
# for plotting. seq_len(.N) replaces 1:.N so that an empty table yields an
# empty index instead of c(1, 0); for non-empty data the result is identical.
df_fit <- whole_data$train[, .(id = seq_len(.N), price, train_tree)]
str(df_fit)
## Classes 'data.table' and 'data.frame':   156 obs. of  3 variables:
##  $ id        : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ price     : num  7395 6855 8949 11549 8238 ...
##  $ train_tree: num  6596 6094 9743 13017 8032 ...
##  - attr(*, ".internal.selfref")=<externalptr>
# Pair every test row's actual price with the tree's prediction for it.
# `id` is a running row index; it is the key used when the table is melted
# for plotting. seq_len(.N) replaces 1:.N so that an empty table yields an
# empty index instead of c(1, 0); for non-empty data the result is identical.
df_pred <- whole_data$test[, .(id = seq_len(.N), price, test_tree)]
str(df_pred)
## Classes 'data.table' and 'data.frame':   39 obs. of  3 variables:
##  $ id       : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ price    : num  17450 18920 30760 5151 6377 ...
##  $ test_tree: num  13017 20579 36964 6094 6094 ...
##  - attr(*, ".internal.selfref")=<externalptr>
# Train panel: actual price (black) and fitted value (red) per row id.
# Both panels use the same fixed 0-50000 y-range so they share a scale.
p1 <- ggplot(
  data = melt(df_fit, id.vars = "id"),
  mapping = aes(x = id, y = value, colour = variable)
) +
  geom_point(alpha = 0.65) +
  geom_line(alpha = 0.65) +
  ylim(0, 50000) +
  labs(
    x = "",
    y = "$",
    title = "Regression Tree - Train Prediction on Automobile Price"
  ) +
  scale_colour_manual(values = c("black", "red"))

# Test panel: actual price (black) and predicted value (blue) per row id
p2 <- ggplot(
  data = melt(df_pred, id.vars = "id"),
  mapping = aes(x = id, y = value, colour = variable)
) +
  geom_point(alpha = 0.65) +
  geom_line(alpha = 0.65) +
  ylim(0, 50000) +
  labs(
    x = "",
    y = "$",
    title = "Regression Tree - Test Prediction on Automobile Price"
  ) +
  scale_colour_manual(values = c("black", "blue"))

# Stack the two panels vertically in a single figure
library(gridExtra)
grid.arrange(p1, p2, ncol = 1)

# Test-set error of the regression tree: each metric is stored, then echoed
rmse_tree <- rmse(real = whole_data$test$price, predicted = test_tree)
rmse_tree
## [1] 3851.357
mae_tree <- mae(real = whole_data$test$price, predicted = test_tree)
mae_tree
## [1] 2703.175
mape_tree <- mape(real = whole_data$test$price, predicted = test_tree)
mape_tree
## [1] 0.1915807
# MAPE of the training fit, for comparison with the test value above.
# NOTE(review): arguments are positional here -- presumably (real, predicted);
# confirm against regression_metrics.R.
mape(whole_data$train$price, train_tree)
## [1] 0.09251727